knitr::opts_chunk$set(echo = TRUE)
library(fitdistrplus)
## Loading required package: MASS
## Loading required package: survival
library(logspline)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following object is masked from 'package:utils':
##
## findMatches
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
##
## count
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:GenomicRanges':
##
## subtract
library(pheatmap)
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::%within%() masks IRanges::%within%()
## ✖ IRanges::collapse() masks dplyr::collapse()
## ✖ Biobase::combine() masks BiocGenerics::combine(), dplyr::combine()
## ✖ matrixStats::count() masks dplyr::count()
## ✖ IRanges::desc() masks dplyr::desc()
## ✖ tidyr::expand() masks S4Vectors::expand()
## ✖ tidyr::extract() masks magrittr::extract()
## ✖ dplyr::filter() masks stats::filter()
## ✖ S4Vectors::first() masks dplyr::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## ✖ purrr::reduce() masks GenomicRanges::reduce(), IRanges::reduce()
## ✖ S4Vectors::rename() masks dplyr::rename()
## ✖ lubridate::second() masks S4Vectors::second()
## ✖ lubridate::second<-() masks S4Vectors::second<-()
## ✖ dplyr::select() masks MASS::select()
## ✖ purrr::set_names() masks magrittr::set_names()
## ✖ IRanges::slice() masks dplyr::slice()
## ✖ magrittr::subtract() masks GenomicRanges::subtract()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(mnormt)
library(nlcor)
library(RColorBrewer)
gencode.vM33.All.GeneLengths <- read.table(file="gencode.vM33.All.GeneLengths.txt",sep="\t",col.names = c("GeneID","GeneLength"))
# Convert to Numeric
gencode.vM33.All.GeneLengths <- gencode.vM33.All.GeneLengths %>%
mutate(across(-1, as.numeric))
row.names(gencode.vM33.All.GeneLengths) <- sub("\\..*", "", gencode.vM33.All.GeneLengths$GeneID)
hist(gencode.vM33.All.GeneLengths$GeneLength)
gencode.vM33.All.GeneLengths$log2.GeneLength <- log2(gencode.vM33.All.GeneLengths$GeneLength) + 1
par(mar = c(5, 5, 2, 2))
hist(gencode.vM33.All.GeneLengths$log2.GeneLength, freq = FALSE)
lines(density(gencode.vM33.All.GeneLengths$log2.GeneLength))
par(mar = c(5, 5, 2, 2))
descdist(gencode.vM33.All.GeneLengths$log2.GeneLength, discrete = FALSE)
## summary statistics
## ------
## min: 4.321928 max: 22.4976
## median: 12.66133
## mean: 12.66346
## estimated sd: 3.081378
## estimated skewness: 0.06684799
## estimated kurtosis: 2.206873
fit.norm <- fitdist(gencode.vM33.All.GeneLengths$log2.GeneLength, "norm")
par(mar = c(5, 5, 2, 2))
plot(fit.norm)
fit.weibull <- fitdist(gencode.vM33.All.GeneLengths$log2.GeneLength, "weibull")
par(mar = c(5, 5, 2, 2))
plot(fit.weibull)
fit.lognorm <- fitdist(gencode.vM33.All.GeneLengths$log2.GeneLength, "lnorm")
par(mar = c(5, 5, 2, 2))
plot(fit.lognorm)
fit.norm$aic
## [1] 289754.7
fit.weibull$aic
## [1] 289708.8
fit.lognorm$aic
## [1] 291502.9
Choose top and bottom 5% genes and median genes in our DESeq datasets for sampling.
PDDo <- "/Users/mar/Library/CloudStorage/GoogleDrive-mohitharikatla.education@gmail.com/My Drive/RR9_MurineSequencingData/Thesis/02_rr9/00.1_Overlap_DESeq/PostDESeq_overlap.RData"
load(PDDo)
gencode.vM33.All.GeneLengths.filtered <- gencode.vM33.All.GeneLengths[Overlap_names,]
median_value <- median(gencode.vM33.All.GeneLengths.filtered$log2.GeneLength)
# Calculate the absolute differences between each data point and the median
gencode.vM33.All.GeneLengths.filtered$ab_diff <- abs(gencode.vM33.All.GeneLengths.filtered$log2.GeneLength - median_value)
median_500_genes <- gencode.vM33.All.GeneLengths.filtered[order(gencode.vM33.All.GeneLengths.filtered$ab_diff)[1:500],]
top_500_genes <- gencode.vM33.All.GeneLengths.filtered[order(-gencode.vM33.All.GeneLengths.filtered$GeneLength)[1:500],]
bottom_500_genes <- gencode.vM33.All.GeneLengths.filtered[order(gencode.vM33.All.GeneLengths.filtered$GeneLength)[1:500],]
sample_genes <- list(
median500 = median_500_genes,
top500 = top_500_genes,
bottom500 = bottom_500_genes
)
ONT_median.ds <- ONT_DESeq_overlap.ds[row.names(median_500_genes), ]
Illumina_median.ds <- Illumina_DESeq_overlap.ds[row.names(median_500_genes), ]
ONT_top.ds <- ONT_DESeq_overlap.ds[row.names(top_500_genes), ]
Illumina_top.ds <- Illumina_DESeq_overlap.ds[row.names(top_500_genes), ]
ONT_bottom.ds <- ONT_DESeq_overlap.ds[row.names(bottom_500_genes), ]
Illumina_bottom.ds <- Illumina_DESeq_overlap.ds[row.names(bottom_500_genes), ]
ONT_median.rlog <- rlog(ONT_median.ds,blind=TRUE)
## rlog() may take a few minutes with 30 or more samples,
## vst() is a much faster transformation
Illumina_median.rlog <- rlog(Illumina_median.ds,blind=TRUE)
plotPCA(ONT_median.rlog, intgroup="condition", ntop = 100, returnData=FALSE)
## using ntop=100 top features by variance
plotPCA(Illumina_median.rlog, intgroup="condition", ntop = 100, returnData=FALSE)
## using ntop=100 top features by variance
ONT_top.rlog <- rlog(ONT_top.ds,blind=TRUE)
## rlog() may take a few minutes with 30 or more samples,
## vst() is a much faster transformation
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
Illumina_top.rlog <- rlog(Illumina_top.ds,blind=TRUE)
plotPCA(ONT_top.rlog, intgroup="condition", ntop = 100, returnData=FALSE)
## using ntop=100 top features by variance
plotPCA(Illumina_top.rlog, intgroup="condition", ntop = 100, returnData=FALSE)
## using ntop=100 top features by variance
ONT_bottom.rlog <- rlog(ONT_bottom.ds,blind=TRUE)
## rlog() may take a few minutes with 30 or more samples,
## vst() is a much faster transformation
Illumina_bottom.rlog <- rlog(Illumina_bottom.ds,blind=TRUE)
plotPCA(ONT_bottom.rlog, intgroup="condition", ntop = 100, returnData=FALSE)
## using ntop=100 top features by variance
plotPCA(Illumina_bottom.rlog, intgroup="condition", ntop = 100, returnData=FALSE)
## using ntop=100 top features by variance
design(ONT_bottom.ds) <- ~ condition
ONT_bottom.ds$condition %<>% relevel(ref="GC")
ONT_bottom.ds %<>% DESeq()
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
# normalize for diffs in sequencing depth and abundance per sample
ONT_bottom.ds %<>% estimateSizeFactors()
# gene-wise dispersion estimates across all samples
ONT_bottom.ds %<>% estimateDispersions()
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
# fit a neg. binomial GLM and compute Wald stat for each gene
ONT_bottom.ds %<>% nbinomWaldTest()
ONT_bottom.results <- results(ONT_bottom.ds,independentFiltering = TRUE,alpha = 0.05)
design(Illumina_bottom.ds) <- ~ condition
Illumina_bottom.ds$condition %<>% relevel(ref="GC")
Illumina_bottom.ds %<>% DESeq()
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 5 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
# normalize for diffs in sequencing depth and abundance per sample
Illumina_bottom.ds %<>% estimateSizeFactors()
# gene-wise dispersion estimates across all samples
Illumina_bottom.ds %<>% estimateDispersions()
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
# fit a neg. binomial GLM and compute Wald stat for each gene
Illumina_bottom.ds %<>% nbinomWaldTest()
Illumina_bottom.results <- results(Illumina_bottom.ds,independentFiltering = TRUE,alpha = 0.05)
design(ONT_top.ds) <- ~ condition
ONT_top.ds$condition %<>% relevel(ref="GC")
ONT_top.ds %<>% DESeq()
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 8 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
# normalize for diffs in sequencing depth and abundance per sample
ONT_top.ds %<>% estimateSizeFactors()
# gene-wise dispersion estimates across all samples
ONT_top.ds %<>% estimateDispersions()
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
# fit a neg. binomial GLM and compute Wald stat for each gene
ONT_top.ds %<>% nbinomWaldTest()
ONT_top.results <- results(ONT_top.ds,independentFiltering = TRUE,alpha = 0.05)
design(Illumina_top.ds) <- ~ condition
Illumina_top.ds$condition %<>% relevel(ref="GC")
Illumina_top.ds %<>% DESeq()
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 3 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
# normalize for diffs in sequencing depth and abundance per sample
Illumina_top.ds %<>% estimateSizeFactors()
# gene-wise dispersion estimates across all samples
Illumina_top.ds %<>% estimateDispersions()
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
# fit a neg. binomial GLM and compute Wald stat for each gene
Illumina_top.ds %<>% nbinomWaldTest()
Illumina_top.results <- results(Illumina_top.ds,independentFiltering = TRUE,alpha = 0.05)
design(ONT_median.ds) <- ~ condition
ONT_median.ds$condition %<>% relevel(ref="GC")
ONT_median.ds %<>% DESeq()
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
# normalize for diffs in sequencing depth and abundance per sample
ONT_median.ds %<>% estimateSizeFactors()
# gene-wise dispersion estimates across all samples
ONT_median.ds %<>% estimateDispersions()
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
# fit a neg. binomial GLM and compute Wald stat for each gene
ONT_median.ds %<>% nbinomWaldTest()
ONT_median.results <- results(ONT_median.ds,independentFiltering = TRUE,alpha = 0.05)
design(Illumina_median.ds) <- ~ condition
Illumina_median.ds$condition %<>% relevel(ref="GC")
Illumina_median.ds %<>% DESeq()
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 5 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
# normalize for diffs in sequencing depth and abundance per sample
Illumina_median.ds %<>% estimateSizeFactors()
# gene-wise dispersion estimates across all samples
Illumina_median.ds %<>% estimateDispersions()
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
# fit a neg. binomial GLM and compute Wald stat for each gene
Illumina_median.ds %<>% nbinomWaldTest()
Illumina_median.results <- results(Illumina_median.ds,independentFiltering = TRUE,alpha = 0.05)
ONT_median.results <- ONT_median.results %>% `[`(order(.$padj),)
head(ONT_median.results)
## log2 fold change (MLE): condition F vs GC
## Wald test p-value: condition F vs GC
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSMUSG00000029343 6.729932 -2.502332 0.709312 -3.52783 0.000418983
## ENSMUSG00000018752 11.886726 -0.434294 0.136538 -3.18075 0.001468967
## ENSMUSG00000039382 4.039128 0.745970 0.257303 2.89919 0.003741252
## ENSMUSG00000118663 1.238149 1.113364 0.427758 2.60279 0.009246902
## ENSMUSG00000030722 0.528138 1.569297 0.580396 2.70384 0.006854349
## ENSMUSG00000079048 2.664272 -0.798253 0.299546 -2.66488 0.007701679
## padj
## <numeric>
## ENSMUSG00000029343 0.209492
## ENSMUSG00000018752 0.367242
## ENSMUSG00000039382 0.623542
## ENSMUSG00000118663 0.660493
## ENSMUSG00000030722 0.660493
## ENSMUSG00000079048 0.660493
par(mfrow=c(1,2))
plotCounts(ONT_median.ds, gene="ENSMUSG00000029343", normalized = TRUE, xlab="")
plotCounts(ONT_median.ds, gene = which.max(ONT_median.results$padj), xlab="",
main = "Gene with max. p.adj.\n(=least significant)")
Illumina_median.results <- Illumina_median.results %>% `[`(order(.$padj),)
head(Illumina_median.results)
## log2 fold change (MLE): condition F vs GC
## Wald test p-value: condition F vs GC
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSMUSG00000018752 269.844 -0.589231 0.0944732 -6.23702 4.45987e-10
## ENSMUSG00000026034 1143.537 0.228240 0.0440113 5.18593 2.14936e-07
## ENSMUSG00000038777 171.765 -0.474810 0.0981574 -4.83723 1.31662e-06
## ENSMUSG00000099696 166.496 0.391034 0.0964097 4.05597 4.99276e-05
## ENSMUSG00000042109 1100.254 -0.192379 0.0474723 -4.05244 5.06854e-05
## ENSMUSG00000013833 534.025 -0.231484 0.0571884 -4.04775 5.17120e-05
## padj
## <numeric>
## ENSMUSG00000018752 1.23092e-07
## ENSMUSG00000026034 2.96612e-05
## ENSMUSG00000038777 1.21129e-04
## ENSMUSG00000099696 2.37875e-03
## ENSMUSG00000042109 2.37875e-03
## ENSMUSG00000013833 2.37875e-03
par(mfrow=c(1,2))
plotCounts(Illumina_median.ds, gene="ENSMUSG00000018752", normalized = TRUE, xlab="")
plotCounts(Illumina_median.ds, gene = which.max(Illumina_median.results$padj), xlab="",
main = "Gene with max. p.adj.\n(=least significant)")
# identify genes with the desired adjusted p-value cut-off
Illumina_median.DGEgenes <- rownames(subset(Illumina_median.results, padj < 0.05))
# extract rlog-transformed values into a matrix
Illumina_median_rlog.dge <- Illumina_median.rlog[Illumina_median.DGEgenes,] %>% assay
# heatmap of DEG sorted by p.adjust
pheatmap(Illumina_median_rlog.dge, scale="row",
show_rownames=FALSE, main="Illumina DGE",
color=colorRampPalette(RColorBrewer::brewer.pal(n=7, name="Reds"))(100))
# identify genes with the desired adjusted p-value cut-off
Illumina_top.DGEgenes <- rownames(subset(Illumina_top.results, padj < 0.05))
# extract rlog-transformed values into a matrix
Illumina_top_rlog.dge <- Illumina_top.rlog[Illumina_top.DGEgenes,] %>% assay
# heatmap of DEG sorted by p.adjust
pheatmap(Illumina_top_rlog.dge, scale="row",
show_rownames=FALSE, main="Illumina DGE",
color=colorRampPalette(RColorBrewer::brewer.pal(n=7, name="Reds"))(100))
# identify genes with the desired adjusted p-value cut-off
Illumina_bottom.DGEgenes <- rownames(subset(Illumina_bottom.results, padj < 0.05))
# extract rlog-transformed values into a matrix
Illumina_bottom_rlog.dge <- Illumina_bottom.rlog[Illumina_bottom.DGEgenes,] %>% assay
# heatmap of DEG sorted by p.adjust
# pheatmap(Illumina_bottom_rlog.dge, scale="row",
# show_rownames=FALSE, main="Illumina DGE",
# color=colorRampPalette(RColorBrewer::brewer.pal(n=7, name="Reds"))(100))
changing the color scheme on pheatmap to see more contrast
# identify genes with the desired adjusted p-value cut-off
ONT_median.DGEgenes <- rownames(subset(ONT_median.results, pvalue<0.05))
# extract rlog-transformed values into a matrix
ONT_median_rlog.dge <- ONT_median.rlog[ONT_median.DGEgenes,] %>% assay
# heatmap of DEG sorted by p.adjust
pheatmap(ONT_median_rlog.dge, scale="row",
show_rownames=FALSE, main="ONT DGE",
color = colorRampPalette(c("blue", "red"))(100))
# identify genes with the desired adjusted p-value cut-off
ONT_top.DGEgenes <- rownames(subset(ONT_top.results, pvalue<0.05))
# extract rlog-transformed values into a matrix
ONT_top_rlog.dge <- ONT_top.rlog[ONT_top.DGEgenes,] %>% assay
# heatmap of DEG sorted by p.adjust
pheatmap(ONT_top_rlog.dge, scale="row",
show_rownames=FALSE, main="ONT DGE",
color = colorRampPalette(c("blue", "red"))(100))
# identify genes with the desired adjusted p-value cut-off
ONT_bottom.DGEgenes <- rownames(subset(ONT_bottom.results, pvalue<0.05))
# extract rlog-transformed values into a matrix
ONT_bottom_rlog.dge <- ONT_bottom.rlog[ONT_bottom.DGEgenes,] %>% assay
# heatmap of DEG sorted by p.adjust
pheatmap(ONT_bottom_rlog.dge, scale="row",
show_rownames=FALSE, main="ONT DGE",
color = colorRampPalette(c("blue", "red"))(100))
Since nanopore is not giving a great result with 500 genes. Let’s try
increasing the gene set size.
median_1000_genes <- gencode.vM33.All.GeneLengths.filtered[order(gencode.vM33.All.GeneLengths.filtered$ab_diff)[1:1000],]
top_1000_genes <- gencode.vM33.All.GeneLengths.filtered[order(-gencode.vM33.All.GeneLengths.filtered$GeneLength)[1:1000],]
bottom_1000_genes <- gencode.vM33.All.GeneLengths.filtered[order(gencode.vM33.All.GeneLengths.filtered$GeneLength)[1:1000],]
sample_genes <- list(
median500 = median_500_genes,
top500 = top_500_genes,
bottom500 = bottom_500_genes,
median1000 = median_1000_genes,
top1000 = top_1000_genes,
bottom1000 = bottom_1000_genes
)
ONT_median1000.ds <- ONT_DESeq_overlap.ds[row.names(median_1000_genes), ]
ONT_top1000.ds <- ONT_DESeq_overlap.ds[row.names(top_1000_genes), ]
ONT_bottom1000.ds <- ONT_DESeq_overlap.ds[row.names(bottom_1000_genes), ]
ONT_median1000.rlog <- rlog(ONT_median1000.ds,blind=TRUE)
## rlog() may take a few minutes with 30 or more samples,
## vst() is a much faster transformation
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
plotPCA(ONT_median1000.rlog, intgroup="condition", ntop = 100, returnData=FALSE)
## using ntop=100 top features by variance
ONT_top1000.rlog <- rlog(ONT_top1000.ds,blind=TRUE)
## rlog() may take a few minutes with 30 or more samples,
## vst() is a much faster transformation
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
plotPCA(ONT_top1000.rlog, intgroup="condition", ntop = 100, returnData=FALSE)
## using ntop=100 top features by variance
ONT_bottom1000.rlog <- rlog(ONT_bottom1000.ds,blind=TRUE)
## rlog() may take a few minutes with 30 or more samples,
## vst() is a much faster transformation
plotPCA(ONT_bottom1000.rlog, intgroup="condition", ntop = 100, returnData=FALSE)
## using ntop=100 top features by variance
design(ONT_median1000.ds) <- ~ condition
ONT_median1000.ds$condition %<>% relevel(ref="GC")
ONT_median1000.ds %<>% DESeq()
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
# normalize for diffs in sequencing depth and abundance per sample
ONT_median1000.ds %<>% estimateSizeFactors()
# gene-wise dispersion estimates across all samples
ONT_median1000.ds %<>% estimateDispersions()
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
# fit a neg. binomial GLM and compute Wald stat for each gene
ONT_median1000.ds %<>% nbinomWaldTest()
ONT_median1000.results <- results(ONT_median1000.ds,independentFiltering = TRUE,alpha = 0.05)
design(ONT_top1000.ds) <- ~ condition
ONT_top1000.ds$condition %<>% relevel(ref="GC")
ONT_top1000.ds %<>% DESeq()
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 13 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
# normalize for diffs in sequencing depth and abundance per sample
ONT_top1000.ds %<>% estimateSizeFactors()
# gene-wise dispersion estimates across all samples
ONT_top1000.ds %<>% estimateDispersions()
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
# fit a neg. binomial GLM and compute Wald stat for each gene
ONT_top1000.ds %<>% nbinomWaldTest()
ONT_top1000.results <- results(ONT_top1000.ds,independentFiltering = TRUE,alpha = 0.05)
design(ONT_bottom1000.ds) <- ~ condition
ONT_bottom1000.ds$condition %<>% relevel(ref="GC")
ONT_bottom1000.ds %<>% DESeq()
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 5 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
# normalize for diffs in sequencing depth and abundance per sample
ONT_bottom1000.ds %<>% estimateSizeFactors()
# gene-wise dispersion estimates across all samples
ONT_bottom1000.ds %<>% estimateDispersions()
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
# fit a neg. binomial GLM and compute Wald stat for each gene
ONT_bottom1000.ds %<>% nbinomWaldTest()
ONT_bottom1000.results <- results(ONT_bottom1000.ds,independentFiltering = TRUE,alpha = 0.05)
# identify genes with the desired adjusted p-value cut-off
ONT_median1000.DGEgenes <- rownames(subset(ONT_median1000.results, pvalue<0.05))
# extract rlog-transformed values into a matrix
ONT_median1000_rlog.dge <- ONT_median1000.rlog[ONT_median1000.DGEgenes,] %>% assay
# heatmap of DEG sorted by p.adjust
pheatmap(ONT_median1000_rlog.dge, scale="row",
show_rownames=FALSE, main="ONT DGE",
color = colorRampPalette(c("blue", "red"))(100))
# identify genes with the desired adjusted p-value cut-off
ONT_top1000.DGEgenes <- rownames(subset(ONT_top1000.results, pvalue<0.05))
# extract rlog-transformed values into a matrix
ONT_top1000_rlog.dge <- ONT_top1000.rlog[ONT_top1000.DGEgenes,] %>% assay
# heatmap of DEG sorted by p.adjust
pheatmap(ONT_top1000_rlog.dge, scale="row",
show_rownames=FALSE, main="ONT DGE",
color = colorRampPalette(c("blue", "red"))(100))
# identify genes with the desired adjusted p-value cut-off
ONT_bottom1000.DGEgenes <- rownames(subset(ONT_bottom1000.results, pvalue<0.05))
# extract rlog-transformed values into a matrix
ONT_bottom1000_rlog.dge <- ONT_bottom1000.rlog[ONT_bottom1000.DGEgenes,] %>% assay
# heatmap of DEG sorted by p.adjust
pheatmap(ONT_bottom1000_rlog.dge, scale="row",
show_rownames=FALSE, main="ONT DGE",
color = colorRampPalette(c("blue", "red"))(100))
median_5000_genes <- gencode.vM33.All.GeneLengths.filtered[order(gencode.vM33.All.GeneLengths.filtered$ab_diff)[1:5000],]
top_5000_genes <- gencode.vM33.All.GeneLengths.filtered[order(-gencode.vM33.All.GeneLengths.filtered$GeneLength)[1:5000],]
bottom_5000_genes <- gencode.vM33.All.GeneLengths.filtered[order(gencode.vM33.All.GeneLengths.filtered$GeneLength)[1:5000],]
sample_genes <- list(
median500 = median_500_genes,
top500 = top_500_genes,
bottom500 = bottom_500_genes,
median1000 = median_1000_genes,
top1000 = top_1000_genes,
bottom1000 = bottom_1000_genes,
median5000 = median_5000_genes,
top5000 = top_5000_genes,
bottom5000 = bottom_5000_genes
)
ONT_median5000.ds <- ONT_DESeq_overlap.ds[row.names(median_5000_genes), ]
ONT_top5000.ds <- ONT_DESeq_overlap.ds[row.names(top_5000_genes), ]
ONT_bottom5000.ds <- ONT_DESeq_overlap.ds[row.names(bottom_5000_genes), ]
ONT_median5000.rlog <- rlog(ONT_median5000.ds,blind=TRUE)
## rlog() may take a few minutes with 30 or more samples,
## vst() is a much faster transformation
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
plotPCA(ONT_median5000.rlog, intgroup="condition", ntop = 100, returnData=FALSE)
## using ntop=100 top features by variance
ONT_top5000.rlog <- rlog(ONT_top5000.ds,blind=TRUE)
## rlog() may take a few minutes with 30 or more samples,
## vst() is a much faster transformation
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
plotPCA(ONT_top5000.rlog, intgroup="condition", ntop = 100, returnData=FALSE)
## using ntop=100 top features by variance
ONT_bottom5000.rlog <- rlog(ONT_bottom5000.ds,blind=TRUE)
## rlog() may take a few minutes with 30 or more samples,
## vst() is a much faster transformation
plotPCA(ONT_bottom5000.rlog, intgroup="condition", ntop = 100, returnData=FALSE)
## using ntop=100 top features by variance
design(ONT_median5000.ds) <- ~ condition
ONT_median5000.ds$condition %<>% relevel(ref="GC")
ONT_median5000.ds %<>% DESeq()
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 5 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
# normalize for diffs in sequencing depth and abundance per sample
ONT_median5000.ds %<>% estimateSizeFactors()
# gene-wise dispersion estimates across all samples
ONT_median5000.ds %<>% estimateDispersions()
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
# fit a neg. binomial GLM and compute Wald stat for each gene
ONT_median5000.ds %<>% nbinomWaldTest()
ONT_median5000.results <- results(ONT_median5000.ds,independentFiltering = TRUE,alpha = 0.05)
design(ONT_top5000.ds) <- ~ condition
ONT_top5000.ds$condition %<>% relevel(ref="GC")
ONT_top5000.ds %<>% DESeq()
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 13 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
# normalize for diffs in sequencing depth and abundance per sample
ONT_top5000.ds %<>% estimateSizeFactors()
# gene-wise dispersion estimates across all samples
ONT_top5000.ds %<>% estimateDispersions()
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
# fit a neg. binomial GLM and compute Wald stat for each gene
ONT_top5000.ds %<>% nbinomWaldTest()
ONT_top5000.results <- results(ONT_top5000.ds,independentFiltering = TRUE,alpha = 0.05)
design(ONT_bottom5000.ds) <- ~ condition
ONT_bottom5000.ds$condition %<>% relevel(ref="GC")
ONT_bottom5000.ds %<>% DESeq()
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 9 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
# normalize for diffs in sequencing depth and abundance per sample
ONT_bottom5000.ds %<>% estimateSizeFactors()
# gene-wise dispersion estimates across all samples
ONT_bottom5000.ds %<>% estimateDispersions()
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
# fit a neg. binomial GLM and compute Wald stat for each gene
ONT_bottom5000.ds %<>% nbinomWaldTest()
ONT_bottom5000.results <- results(ONT_bottom5000.ds,independentFiltering = TRUE,alpha = 0.05)
# identify genes with the desired adjusted p-value cut-off
ONT_median5000.DGEgenes <- rownames(subset(ONT_median5000.results, padj<0.05))
# extract rlog-transformed values into a matrix
ONT_median5000_rlog.dge <- ONT_median5000.rlog[ONT_median5000.DGEgenes,] %>% assay
# heatmap of DEG sorted by p.adjust
pheatmap(ONT_median5000_rlog.dge, scale="row",
show_rownames=FALSE, main="ONT DGE",
color = colorRampPalette(c("blue", "red"))(100))
# identify genes with the desired adjusted p-value cut-off
ONT_top5000.DGEgenes <- rownames(subset(ONT_top5000.results, padj<0.05))
# extract rlog-transformed values into a matrix
ONT_top5000_rlog.dge <- ONT_top5000.rlog[ONT_top5000.DGEgenes,] %>% assay
# heatmap of DEG sorted by p.adjust
pheatmap(ONT_top5000_rlog.dge, scale="row",
show_rownames=FALSE, main="ONT DGE",
color = colorRampPalette(c("blue", "red"))(100))
# identify genes with the desired adjusted p-value cut-off
ONT_bottom5000.DGEgenes <- rownames(subset(ONT_bottom5000.results, padj<0.05))
# extract rlog-transformed values into a matrix
ONT_bottom5000_rlog.dge <- ONT_bottom5000.rlog[ONT_bottom5000.DGEgenes,] %>% assay
# heatmap of DEG sorted by p.adjust
pheatmap(ONT_bottom5000_rlog.dge, scale="row",
show_rownames=FALSE, main="ONT DGE",
color = colorRampPalette(c("blue", "red"))(100))
median_8000_genes <- gencode.vM33.All.GeneLengths.filtered[order(gencode.vM33.All.GeneLengths.filtered$ab_diff)[1:8000],]
top_8000_genes <- gencode.vM33.All.GeneLengths.filtered[order(-gencode.vM33.All.GeneLengths.filtered$GeneLength)[1:8000],]
bottom_8000_genes <- gencode.vM33.All.GeneLengths.filtered[order(gencode.vM33.All.GeneLengths.filtered$GeneLength)[1:8000],]
sample_genes <- list(
median500 = median_500_genes,
top500 = top_500_genes,
bottom500 = bottom_500_genes,
median1000 = median_1000_genes,
top1000 = top_1000_genes,
bottom1000 = bottom_1000_genes,
median5000 = median_5000_genes,
top5000 = top_5000_genes,
bottom5000 = bottom_5000_genes,
median8000 = median_8000_genes,
top8000 = top_8000_genes,
bottom8000 = bottom_8000_genes
)
ONT_median8000.ds <- ONT_DESeq_overlap.ds[row.names(median_8000_genes), ]
ONT_top8000.ds <- ONT_DESeq_overlap.ds[row.names(top_8000_genes), ]
ONT_bottom8000.ds <- ONT_DESeq_overlap.ds[row.names(bottom_8000_genes), ]
ONT_median8000.rlog <- rlog(ONT_median8000.ds,blind=TRUE)
## rlog() may take a few minutes with 30 or more samples,
## vst() is a much faster transformation
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
plotPCA(ONT_median8000.rlog, intgroup="condition", ntop = 100, returnData=FALSE)
## using ntop=100 top features by variance
ONT_top8000.rlog <- rlog(ONT_top8000.ds,blind=TRUE)
## rlog() may take a few minutes with 30 or more samples,
## vst() is a much faster transformation
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
plotPCA(ONT_top8000.rlog, intgroup="condition", ntop = 100, returnData=FALSE)
## using ntop=100 top features by variance
ONT_bottom8000.rlog <- rlog(ONT_bottom8000.ds,blind=TRUE)
## rlog() may take a few minutes with 30 or more samples,
## vst() is a much faster transformation
plotPCA(ONT_bottom8000.rlog, intgroup="condition", ntop = 100, returnData=FALSE)
## using ntop=100 top features by variance
design(ONT_median8000.ds) <- ~ condition
ONT_median8000.ds$condition %<>% relevel(ref="GC")
ONT_median8000.ds %<>% DESeq()
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 6 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
# normalize for diffs in sequencing depth and abundance per sample
ONT_median8000.ds %<>% estimateSizeFactors()
# gene-wise dispersion estimates across all samples
ONT_median8000.ds %<>% estimateDispersions()
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
# fit a neg. binomial GLM and compute Wald stat for each gene
ONT_median8000.ds %<>% nbinomWaldTest()
ONT_median8000.results <- results(ONT_median8000.ds,independentFiltering = TRUE,alpha = 0.05)
design(ONT_top8000.ds) <- ~ condition
ONT_top8000.ds$condition %<>% relevel(ref="GC")
ONT_top8000.ds %<>% DESeq()
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 13 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
# normalize for diffs in sequencing depth and abundance per sample
ONT_top8000.ds %<>% estimateSizeFactors()
# gene-wise dispersion estimates across all samples
ONT_top8000.ds %<>% estimateDispersions()
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
# fit a neg. binomial GLM and compute Wald stat for each gene
ONT_top8000.ds %<>% nbinomWaldTest()
ONT_top8000.results <- results(ONT_top8000.ds,independentFiltering = TRUE,alpha = 0.05)
design(ONT_bottom8000.ds) <- ~ condition
ONT_bottom8000.ds$condition %<>% relevel(ref="GC")
ONT_bottom8000.ds %<>% DESeq()
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 12 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
# normalize for diffs in sequencing depth and abundance per sample
ONT_bottom8000.ds %<>% estimateSizeFactors()
# gene-wise dispersion estimates across all samples
ONT_bottom8000.ds %<>% estimateDispersions()
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
# fit a neg. binomial GLM and compute Wald stat for each gene
ONT_bottom8000.ds %<>% nbinomWaldTest()
ONT_bottom8000.results <- results(ONT_bottom8000.ds,independentFiltering = TRUE,alpha = 0.05)
# identify genes with the desired adjusted p-value cut-off
ONT_median8000.DGEgenes <- rownames(subset(ONT_median8000.results, padj<0.05))
# extract rlog-transformed values into a matrix
ONT_median8000_rlog.dge <- ONT_median8000.rlog[ONT_median8000.DGEgenes,] %>% assay
# heatmap of DEG sorted by p.adjust
pheatmap(ONT_median8000_rlog.dge, scale="row",
show_rownames=FALSE, main="ONT DGE",
color = colorRampPalette(c("blue", "red"))(100))
# identify genes with the desired adjusted p-value cut-off
ONT_top8000.DGEgenes <- rownames(subset(ONT_top8000.results, padj<0.05))
# extract rlog-transformed values into a matrix
ONT_top8000_rlog.dge <- ONT_top8000.rlog[ONT_top8000.DGEgenes,] %>% assay
# heatmap of DEG sorted by p.adjust
pheatmap(ONT_top8000_rlog.dge, scale="row",
show_rownames=FALSE, main="ONT DGE",
color = colorRampPalette(c("blue", "red"))(100))
# identify genes with the desired adjusted p-value cut-off
ONT_bottom8000.DGEgenes <- rownames(subset(ONT_bottom8000.results, padj<0.05))
# extract rlog-transformed values into a matrix
ONT_bottom8000_rlog.dge <- ONT_bottom8000.rlog[ONT_bottom8000.DGEgenes,] %>% assay
# heatmap of DEG sorted by p.adjust
pheatmap(ONT_bottom8000_rlog.dge, scale="row",
show_rownames=FALSE, main="ONT DGE",
color = colorRampPalette(c("blue", "red"))(100))
gencode.vM33.ONT.GeneLengths <- gencode.vM33.All.GeneLengths[row.names(ONT_DESeq.ds),]
gencode.vM33.ONT.GeneLengths.overlap <- gencode.vM33.All.GeneLengths.filtered[row.names(ONT_DESeq_overlap.ds),]
hist(gencode.vM33.ONT.GeneLengths$log2.GeneLength, freq = FALSE)
ONT_medians <- apply(counts(ONT_DESeq.ds), 1, median)
gencode.vM33.ONT.GeneLengths$medianCounts <- ONT_medians[match(rownames(gencode.vM33.ONT.GeneLengths), names(ONT_medians))]
gencode.vM33.ONT.GeneLengths.overlap$medianCounts <- ONT_medians[match(rownames(gencode.vM33.ONT.GeneLengths.overlap), names(ONT_medians))]
#Scatter plot of log(medianCounts) and log(geneLengths)
plot(log2(medianCounts) ~ log2.GeneLength, data = gencode.vM33.ONT.GeneLengths,
main = "Scatter Plot of log2(Median Counts) vs. log2(Gene Length)",
xlab = "log2(Gene Length)", ylab = "log2(Median Counts)", col = "black", pch = 16)
gencode.vM33.Illumina.GeneLengths <- gencode.vM33.All.GeneLengths[row.names(Illumina_DESeq.ds),]
gencode.vM33.Illumina.GeneLengths.overlap <- gencode.vM33.All.GeneLengths.filtered[row.names(Illumina_DESeq_overlap.ds),]
hist(gencode.vM33.Illumina.GeneLengths$log2.GeneLength, freq = FALSE)
Illumina_medians <- apply(counts(Illumina_DESeq.ds), 1, median)
gencode.vM33.Illumina.GeneLengths$medianCounts <- Illumina_medians[match(rownames(gencode.vM33.Illumina.GeneLengths), names(Illumina_medians))]
gencode.vM33.Illumina.GeneLengths.overlap$medianCounts <- Illumina_medians[match(rownames(gencode.vM33.Illumina.GeneLengths.overlap), names(Illumina_medians))]
#Scatter plot of log(medianCounts) and log(geneLengths)
plot(log2(medianCounts) ~ log2.GeneLength, data = gencode.vM33.Illumina.GeneLengths,
main = "Scatter Plot of log2(Median Counts) vs. log2(Gene Length)",
xlab = "log2(Gene Length)", ylab = "log2(Median Counts)", col = "black", pch = 16)
ONT_hist1 <- hist(gencode.vM33.ONT.GeneLengths$log2.GeneLength, freq = FALSE)
Illumina_hist1 <- hist(gencode.vM33.Illumina.GeneLengths$log2.GeneLength, freq = FALSE)
# Set the alpha (transparency) value, e.g., 0.5 for 50% transparency
transparency <- 0.5
# Create translucent colors using the rgb function
red_translucent <- rgb(1, 0, 0, alpha = transparency)
blue_translucent <- rgb(0, 0, 1, alpha = transparency)
# Find the overall range of y values
y_max <- max(c(ONT_hist1$counts, Illumina_hist1$counts))
# Plot histograms with translucent colors and adjusted y-axis limits
plot(ONT_hist1, col = red_translucent, ylim = c(0, y_max)) # first histogram
plot(Illumina_hist1, col = blue_translucent, add = TRUE) # second histogram
#increase transparency
transparency <- 0.3
# Create translucent colors using the rgb function
red_translucent <- rgb(1, 0, 0, alpha = transparency)
blue_translucent <- rgb(0, 0, 1, alpha = transparency)
# Scatter plot of log(medianCounts) and log(geneLengths) for ONT data
plot(log2(medianCounts) ~ log2.GeneLength, data = gencode.vM33.ONT.GeneLengths.overlap,
main = "Scatter Plot of log2(Median Counts) vs. log2(Gene Length)",
xlab = "log2(Gene Length)", ylab = "log2(Median Counts)", col = red_translucent, pch = 1,cex=0.1)
# Add points for Illumina data on the same plot
points(log2(medianCounts) ~ log2.GeneLength, data = gencode.vM33.Illumina.GeneLengths.overlap,
col = blue_translucent, pch = 2,cex=0.1)
Illumina overestimates/gives more read counts to longer genes.
###Fit distributions of the scatter plots.
#Combine the median gene counts from both platforms
gencode.vM33.Illumina.GeneLengths$log2.medianCounts <- log2(gencode.vM33.Illumina.GeneLengths$medianCounts + 1)
gencode.vM33.ONT.GeneLengths$log2.medianCounts <- log2(gencode.vM33.ONT.GeneLengths$medianCounts + 1)
gencode.vM33.Illumina.GeneLengths.overlap$log2.medianCounts <- log2(gencode.vM33.Illumina.GeneLengths.overlap$medianCounts + 1)
gencode.vM33.ONT.GeneLengths.overlap$log2.medianCounts <- log2(gencode.vM33.ONT.GeneLengths.overlap$medianCounts + 1)
gencode.vM33.Combined.GeneLengths <- cbind(gencode.vM33.Illumina.GeneLengths.overlap,gencode.vM33.ONT.GeneLengths.overlap[,5:6])
selected_columns1 <- 5:6
prefix1 <- "Illumina_"
colnames(gencode.vM33.Combined.GeneLengths)[selected_columns1] <- paste0(prefix1,colnames(gencode.vM33.Illumina.GeneLengths.overlap)[selected_columns1])
selected_columns2 <- 7:8
prefix2 <- "ONT_"
colnames(gencode.vM33.Combined.GeneLengths)[selected_columns2] <- paste0(prefix2,colnames(gencode.vM33.ONT.GeneLengths.overlap)[selected_columns1])
hist(gencode.vM33.Combined.GeneLengths$Illumina_log2.medianCounts,freq = FALSE)
lines(density(gencode.vM33.Combined.GeneLengths$Illumina_log2.medianCounts))
hist(gencode.vM33.Combined.GeneLengths$ONT_log2.medianCounts, freq = FALSE)
lines(density(gencode.vM33.Combined.GeneLengths$ONT_log2.medianCounts))
par(mar = c(5, 5, 2, 2))
descdist(gencode.vM33.Combined.GeneLengths$Illumina_log2.medianCounts, discrete = FALSE)
## summary statistics
## ------
## min: 0 max: 19.29066
## median: 5.658211
## mean: 5.414033
## estimated sd: 3.488715
## estimated skewness: -0.001768405
## estimated kurtosis: 1.949425
descdist(gencode.vM33.Combined.GeneLengths$ONT_log2.medianCounts, discrete = FALSE)
## summary statistics
## ------
## min: 0 max: 13.17344
## median: 0.5849625
## mean: 1.503343
## estimated sd: 1.878414
## estimated skewness: 1.169843
## estimated kurtosis: 3.916078
cor(gencode.vM33.Combined.GeneLengths$Illumina_log2.medianCounts, gencode.vM33.Combined.GeneLengths$log2.GeneLength, method='pearson')
## [1] 0.4780111
cor(gencode.vM33.Combined.GeneLengths$ONT_log2.medianCounts, gencode.vM33.Combined.GeneLengths$log2.GeneLength, method='pearson')
## [1] 0.2076635
Since the correlations are not linear for ONT, I will try to assess the correlation assuming the data is non-linearly correlated.
nlcor(gencode.vM33.Combined.GeneLengths$log2.GeneLength,gencode.vM33.Combined.GeneLengths$Illumina_log2.medianCounts,plt=T)
## $cor.estimate
## [1] 0.4780111
##
## $adjusted.p.value
## [1] 0
##
## $cor.plot
nlcor(gencode.vM33.Combined.GeneLengths$log2.GeneLength,gencode.vM33.Combined.GeneLengths$ONT_log2.medianCounts,plt=T)
## $cor.estimate
## [1] 0.2076635
##
## $adjusted.p.value
## [1] 0
##
## $cor.plot
There is negligible correlation between nanopore sequencing gene counts and gene length.